import sys

# Extract "queryNumber term docID" triples from the machine-generated training
# file (ARFF-style: a block of metadata lines followed by comma-separated data
# rows) and write one triple per output line.

# Number of comma-separated fields every data row is expected to have.  Any
# line with a different field count is treated as metadata (before the first
# data row) or as a malformed row (after it).
EXPECTED_FIELD_COUNT = 24

inputFileName = "/data1/team/weijiang/machine-learning-project-related/featuresGenerated/finalTrainingFiles/feature_file_machine_generated.arff"
outputFileName = "/data1/team/weijiang/machine-learning-project-related/featuresGenerated/auxFiles/queryID_term_docID_pair"


def skipMetadataLines(lines):
    """Advance *lines* past the leading metadata lines.

    lines -- an iterator over raw input lines; it is consumed up to and
             including the first data row.

    Returns (metaCount, firstDataLine).  firstDataLine is the first line with
    EXPECTED_FIELD_COUNT comma-separated fields, handed back to the caller so
    that it is not lost, or None when the input ends before any such line is
    seen (so an input without data rows terminates instead of looping).
    """
    metaCount = 0
    for line in lines:
        if len(line.split(",")) == EXPECTED_FIELD_COUNT:
            return metaCount, line
        metaCount += 1
    return metaCount, None


def pairsForLine(line):
    """Return the output lines for one data row, or None if it is malformed.

    The row is validated *before* any field is indexed, so a short line is
    reported as malformed rather than raising IndexError.

    Field 3 is stripped of its first and last character (presumably the
    enclosing quotes -- TODO confirm against the feature file) and split on
    single spaces; each resulting term yields "<field 1> <term> <field 0>\\n".
    Per the script's banner, field 1 is presumably the query number and
    field 0 the docID.
    """
    fields = line.strip().split(",")
    if len(fields) != EXPECTED_FIELD_COUNT:
        return None
    return [fields[1] + " " + term + " " + fields[0] + "\n"
            for term in fields[3][1:-1].split(" ")]


def main():
    """Run the extraction from inputFileName to outputFileName.

    Prints the number of metadata and data lines.  On the first malformed
    data row prints "Problem" and exits with status 1.
    """
    print("This script is used for extracting (queryNumber term docID) pair from the final training file")

    # Both files are managed by `with` so they are closed (and the output
    # flushed) on every exit path, including sys.exit below.
    with open(inputFileName, "r") as inputFileHandler:
        with open(outputFileName, "w") as outputFileHandler:
            metaCount, currentLine = skipMetadataLines(inputFileHandler)
            print("# of meta data lines: %d" % metaCount)

            dataCount = 0
            # Stream the rows one at a time instead of loading the whole
            # file; the loop starts with the row that ended the header scan.
            while currentLine is not None:
                pairs = pairsForLine(currentLine)
                if pairs is None:
                    print("Problem")
                    sys.exit(1)
                outputFileHandler.writelines(pairs)
                dataCount += 1
                currentLine = next(inputFileHandler, None)

    print("# of data lines: %d" % dataCount)


if __name__ == "__main__":
    main()